In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from lazypredict.Supervised import LazyRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error

Data Wrangling¶

In [2]:
# Read the sleep dataset; the path is relative to the notebook's working directory.
path = 'Datasets/Sleep.csv'
df = pd.read_csv(path)
In [3]:
# Peek at the first rows of the loaded frame.
df.head()
Out[3]:
Person ID Gender Age Occupation Sleep Duration Quality of Sleep Physical Activity Level Stress Level BMI Category Heart Rate Daily Steps Sleep Disorder BP High BP Low
0 1 Male 27 Software Engineer 6.10 6 42 6 Overweight 77 4200 No Disorder 126 83
1 2 Male 28 Doctor 6.20 6 60 8 Normal 75 10000 No Disorder 125 80
2 3 Male 28 Doctor 6.20 6 60 8 Normal 75 10000 No Disorder 125 80
3 4 Male 28 Software Engineer 5.90 4 30 8 Obese 85 3000 Sleep Apnea 140 90
4 5 Male 28 Software Engineer 5.90 4 30 8 Obese 85 3000 Sleep Apnea 140 90
In [4]:
# Peek at the last rows of the loaded frame.
df.tail()
Out[4]:
Person ID Gender Age Occupation Sleep Duration Quality of Sleep Physical Activity Level Stress Level BMI Category Heart Rate Daily Steps Sleep Disorder BP High BP Low
369 370 Female 59 Nurse 8.10 9 75 3 Overweight 68 7000 Sleep Apnea 140 95
370 371 Female 59 Nurse 8.00 9 75 3 Overweight 68 7000 Sleep Apnea 140 95
371 372 Female 59 Nurse 8.10 9 75 3 Overweight 68 7000 Sleep Apnea 140 95
372 373 Female 59 Nurse 8.10 9 75 3 Overweight 68 7000 Sleep Apnea 140 95
373 374 Female 59 Nurse 8.10 9 75 3 Overweight 68 7000 Sleep Apnea 140 95
In [5]:
# Column dtypes, non-null counts and memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Heart Rate               374 non-null    int64  
 10  Daily Steps              374 non-null    int64  
 11  Sleep Disorder           374 non-null    object 
 12  BP High                  374 non-null    int64  
 13  BP Low                   374 non-null    int64  
dtypes: float64(1), int64(9), object(4)
memory usage: 41.0+ KB
In [6]:
# List the column labels.
df.columns
Out[6]:
Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Heart Rate', 'Daily Steps', 'Sleep Disorder',
       'BP High', 'BP Low'],
      dtype='object')
In [7]:
# Frame dimensions as (rows, columns).
df.shape
Out[7]:
(374, 14)
In [8]:
# Count missing values per column.
df.isnull().sum()
Out[8]:
Person ID                  0
Gender                     0
Age                        0
Occupation                 0
Sleep Duration             0
Quality of Sleep           0
Physical Activity Level    0
Stress Level               0
BMI Category               0
Heart Rate                 0
Daily Steps                0
Sleep Disorder             0
BP High                    0
BP Low                     0
dtype: int64
In [9]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)
Out[9]:
Person ID Age Sleep Duration Quality of Sleep Physical Activity Level Stress Level Heart Rate Daily Steps BP High BP Low
count 374.00 374.00 374.00 374.00 374.00 374.00 374.00 374.00 374.00 374.00
mean 187.50 42.18 7.13 7.31 59.17 5.39 70.17 6816.84 128.55 84.65
std 108.11 8.67 0.80 1.20 20.83 1.77 4.14 1617.92 7.75 6.16
min 1.00 27.00 5.80 4.00 30.00 3.00 65.00 3000.00 115.00 75.00
25% 94.25 35.25 6.40 6.00 45.00 4.00 68.00 5600.00 125.00 80.00
50% 187.50 43.00 7.20 7.00 60.00 5.00 70.00 7000.00 130.00 85.00
75% 280.75 50.00 7.80 8.00 75.00 7.00 72.00 8000.00 135.00 90.00
max 374.00 59.00 8.50 9.00 90.00 8.00 86.00 10000.00 142.00 95.00
In [10]:
# Frequency of each gender label.
df['Gender'].value_counts()
Out[10]:
Gender
Male      189
Female    185
Name: count, dtype: int64
In [11]:
# Frequency of each occupation label.
df['Occupation'].value_counts()
Out[11]:
Occupation
Software Engineer    109
Nurse                 73
Doctor                71
Lawyer                47
Accountant            37
Salesperson           32
Scientist              4
Manager                1
Name: count, dtype: int64
In [12]:
# Frequency of each BMI category label.
df['BMI Category'].value_counts()
Out[12]:
BMI Category
Normal           195
Overweight       148
Normal Weight     21
Obese             10
Name: count, dtype: int64
In [13]:
# Treat 'Normal Weight' as a duplicate spelling of 'Normal' and merge the two labels.
df['BMI Category'] = df['BMI Category'].replace('Normal Weight', 'Normal')
df['BMI Category'].value_counts()
Out[13]:
BMI Category
Normal        216
Overweight    148
Obese          10
Name: count, dtype: int64
In [14]:
# Frequency of each sleep-disorder label.
df['Sleep Disorder'].value_counts()
Out[14]:
Sleep Disorder
No Disorder    219
Sleep Apnea     78
Insomnia        77
Name: count, dtype: int64
In [15]:
# Binary flag: 1 when any sleep disorder is recorded, 0 for 'No Disorder'.
# The comparison is vectorised, so it needs no NaN placeholder column and does
# not depend on the frame having a default 0..n-1 index the way a positional
# `df.loc[i, ...]` loop does.
df['Sick'] = (df['Sleep Disorder'] != 'No Disorder').astype(int)
df['Sick'].value_counts()
Out[15]:
Sick
0    219
1    155
Name: count, dtype: int64

Data Visualization¶

Correlation Matrix¶

In [16]:
# Correlate the numeric measurements only. Selecting by the generic 'number'
# kind keeps integer columns whatever their width (a plain `astype(int)` column
# is int32 on some platforms and would be missed by an explicit int64 filter).
# 'Person ID' is a row identifier, not a measurement, so it is left out.
numeric_df = df.select_dtypes(include='number').drop(columns='Person ID')

# Pairwise correlations using the pandas default method.
correlation_matrix = numeric_df.corr()

# Draw on an explicit Axes so the title is attached to this figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()
No description has been provided for this image

How do the other variables relate to stress level?¶

In [17]:
# Stress level against age, one colour per gender.
fig = px.scatter(
    df,
    x='Age',
    y='Stress Level',
    color='Gender',
)
fig.update_layout(
    title='Age vs Stress Level',
    xaxis_title='Age',
    yaxis_title='Stress Level',
    font_size=12,
)
fig.show()
In [18]:
# Stress level against sleep quality, with box plots on both margins.
fig = px.scatter(
    df,
    x='Quality of Sleep',
    y='Stress Level',
    color='Gender',
    marginal_x='box',
    marginal_y='box',
)
fig.update_layout(
    title='Quality of Sleep vs Stress Level',
    xaxis_title='Quality of Sleep',
    yaxis_title='Stress Level',
    font_size=12,
)
fig.show()
In [19]:
# Stress level against physical activity, with a box plot on the x margin.
fig = px.scatter(
    df,
    x='Physical Activity Level',
    y='Stress Level',
    color='Gender',
    marginal_x='box',
)
fig.update_layout(
    title='Activity vs Stress Level',
    xaxis_title='Activity Level',
    yaxis_title='Stress Level',
    font_size=12,
)
fig.show()
In [20]:
# Stress level against sleep duration, one colour per gender.
fig = px.scatter(
    df,
    x='Sleep Duration',
    y='Stress Level',
    color='Gender',
)
fig.update_layout(
    title='Sleep Duration vs Stress Level',
    xaxis_title='Sleep Duration',
    yaxis_title='Stress Level',
    font_size=12,
)
fig.show()
In [21]:
# Both blood-pressure columns against stress level; marker size follows heart rate.
fig = px.scatter(
    df,
    x='Stress Level',
    y=['BP High', 'BP Low'],
    size='Heart Rate',
)
fig.update_layout(
    title='Stress Level vs Blood Pressure',
    xaxis_title='Stress Level',
    yaxis_title='Blood Pressure',
    font_size=12,
)
fig.show()

Demographic Analysis¶

In [22]:
# Number of rows per gender, as a two-column frame for plotting.
gender = df.groupby('Gender').size().reset_index(name='Count')

# Donut chart (hole=.5) of the gender split.
fig = px.pie(
    gender,
    names='Gender',
    values='Count',
    hole=.5,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_layout(
    title='Gender Distribution',
    title_x=.5,
    font_size=12,
    width=500,
)

# Show each slice's label and percentage inside the slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
In [23]:
# Age histogram, coloured by gender; the y-axis title is removed.
fig = px.histogram(
    df,
    x='Age',
    color='Gender',
    nbins=20,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.update_traces(opacity=0.7)
fig.update_layout(
    title='Age Distribution',
    xaxis_title='Age',
    yaxis_title=None,
    font_size=12,
    width=900,
)
fig.show()
In [24]:
# Mean stress level per occupation, highest first.
occupation_stress_level = (
    df.groupby('Occupation')['Stress Level']
    .mean()
    .reset_index(name='Stress Level')
    .sort_values(by='Stress Level', ascending=False)
)

fig = px.bar(
    occupation_stress_level,
    x='Occupation',
    y='Stress Level',
    color='Occupation',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
# The x-axis already names each occupation, so the legend is hidden.
fig.update_layout(
    title='Occupation vs Stress Level',
    xaxis_title='Occupation',
    yaxis_title='Stress Level',
    font_size=12,
    width=900,
    showlegend=False,
)
fig.show()
In [25]:
# Mean 'BP High' and 'BP Low' per sleep-disorder group ...
sleep_do_bp = (
    df.groupby('Sleep Disorder')[['BP High', 'BP Low']]
    .mean()
    .reset_index()
)
# ... reshaped to long form: one row per (group, pressure column) pair.
df_bp = sleep_do_bp.melt(
    id_vars='Sleep Disorder',
    var_name='Blood Pressure Type',
    value_name='Pressure Value',
)
df_bp
Out[25]:
Sleep Disorder Blood Pressure Type Pressure Value
0 Insomnia BP High 132.04
1 No Disorder BP High 124.05
2 Sleep Apnea BP High 137.77
3 Insomnia BP Low 86.86
4 No Disorder BP Low 81.00
5 Sleep Apnea BP Low 92.72
In [26]:
# Grouped bars: the two pressure columns side by side for each sleep-disorder group.
fig = px.bar(
    df_bp,
    x='Sleep Disorder',
    y='Pressure Value',
    color='Blood Pressure Type',
    color_discrete_sequence=px.colors.sequential.RdBu,
    barmode='group',
)
fig.update_layout(
    title='Sleep Disorder vs Blood Pressure',
    xaxis_title='Sleep Disorder',
    yaxis_title='Blood Pressure',
    font_size=12,
    width=900,
)
fig.show()
In [27]:
# Mean 'BP High' and 'BP Low' per BMI category ...
bmi_bpressure = (
    df.groupby('BMI Category')[['BP High', 'BP Low']]
    .mean()
    .reset_index()
)
# ... reshaped to long form: one row per (category, pressure column) pair.
bmi_df = bmi_bpressure.melt(
    id_vars='BMI Category',
    var_name='Blood Pressure Type',
    value_name='Pressure Value',
)
bmi_df
Out[27]:
BMI Category Blood Pressure Type Pressure Value
0 Normal BP High 123.61
1 Obese BP High 139.20
2 Overweight BP High 135.05
3 Normal BP Low 80.60
4 Obese BP Low 90.20
5 Overweight BP Low 90.18
In [28]:
# Grouped bars: the two pressure columns side by side for each BMI category.
fig = px.bar(
    bmi_df,
    x='BMI Category',
    y='Pressure Value',
    color='Blood Pressure Type',
    color_discrete_sequence=px.colors.sequential.RdBu,
    barmode='group',
)
fig.update_layout(
    title='BMI Category vs Blood Pressure',
    xaxis_title='BMI Category',
    yaxis_title='Blood Pressure',
    legend_title='Blood Pressure',
    font_size=12,
    width=900,
)
fig.show()

Pair Plot¶

In [29]:
import warnings

# Use the generic 'number' selector so integer columns of any width are kept,
# and leave out 'Person ID', which is an identifier rather than a measurement.
numeric_df = df.select_dtypes(include='number').drop(columns='Person ID')

# Global seaborn theme; it also applies to matplotlib figures drawn afterwards.
sns.set(style="darkgrid")

# NOTE(review): this silences every warning raised while plotting; narrow the
# filter to a specific category once the offending warning is identified.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # No `hue` is given, so a `palette` argument would have no effect; it is omitted.
    pair_grid = sns.pairplot(numeric_df, diag_kind="hist", markers="s")
    # Title the grid's own figure instead of relying on the pyplot "current figure".
    pair_grid.fig.suptitle('Pairplot of Variables', y=1.02)
    plt.show()
No description has been provided for this image

Model Training¶

In [30]:
# Features: every column except the row identifier and the target.
# NOTE(review): 'Sick' is derived from 'Sleep Disorder', so the two columns
# carry overlapping information.
X = df.drop(columns=['Person ID', 'Stress Level'])
# Target kept as a one-column DataFrame (double brackets), not a Series.
y = df[['Stress Level']]
In [31]:
# First three feature rows, before any encoding or scaling is applied.
X.head(3)
Out[31]:
Gender Age Occupation Sleep Duration Quality of Sleep Physical Activity Level BMI Category Heart Rate Daily Steps Sleep Disorder BP High BP Low Sick
0 Male 27 Software Engineer 6.10 6 42 Overweight 77 4200 No Disorder 126 83 0
1 Male 28 Doctor 6.20 6 60 Normal 75 10000 No Disorder 125 80 0
2 Male 28 Doctor 6.20 6 60 Normal 75 10000 No Disorder 125 80 0
In [32]:
from sklearn.preprocessing import OrdinalEncoder

categoric = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']

# OrdinalEncoder is the scikit-learn encoder intended for feature columns
# (LabelEncoder is documented for targets). It assigns the same codes here,
# i.e. the position of each value among the column's sorted unique values,
# and a single fitted object keeps the mapping of every column in
# `encoder.categories_` instead of only the last one processed.
# NOTE(review): integer codes impose an arbitrary order on nominal features
# such as 'Occupation'; consider one-hot encoding for the linear model.
encoder = OrdinalEncoder(dtype=np.int64)
X[categoric] = encoder.fit_transform(X[categoric])
In [33]:
numeric = ['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Heart Rate', 'Daily Steps', 'BP High', 'BP Low']

# Standardise all numeric columns in one call. StandardScaler works per column,
# so the values match a column-by-column loop, but the fitted scaler now holds
# the statistics of every column rather than only the last one refitted.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics enter the features. The linear pipeline standardises
# again on the training rows, so this step looks redundant there.
scaler = StandardScaler()
X[numeric] = scaler.fit_transform(X[numeric])
In [34]:
# First three target rows.
y.head(3)
Out[34]:
Stress Level
0 6
1 8
2 8
In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# (`train_test_split` is already imported in the notebook's first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
In [36]:
# Linear regression preceded by standardisation, both fitted on the training rows only.
linear_model = make_pipeline(StandardScaler(), LinearRegression())
linear_model.fit(X_train, y_train)

# Predictions for the held-out rows.
linear_predictions = linear_model.predict(X_test)
In [37]:
# Random forest regressor (no pipeline: it is fitted directly on the features).
rf_model = RandomForestRegressor(random_state=42)

# `y_train` is a one-column DataFrame; flatten it to 1-D so scikit-learn does
# not raise a DataConversionWarning about a column-vector target.
rf_model.fit(X_train, np.ravel(y_train))
rf_predictions = rf_model.predict(X_test)
In [38]:
# Held-out MSE, MAE and R² of the linear model, computed in that order.
linear_mse, linear_mae, linear_r2 = (
    metric(y_test, linear_predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
In [39]:
# Report the linear model's metrics, one per line.
print(
    "Linear Regression:",
    f"Mean Squared Error: {linear_mse}",
    f"Mean Absolute Error: {linear_mae}",
    f"R-squared: {linear_r2}",
    sep="\n",
)
Linear Regression:
Mean Squared Error: 0.13157840180849717
Mean Absolute Error: 0.2489498157035418
R-squared: 0.9578850284412884
In [40]:
# Held-out MSE, MAE and R² of the random forest, computed in that order.
rf_mse, rf_mae, rf_r2 = (
    metric(y_test, rf_predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
In [41]:
# Report the random forest's metrics, one per line, after a blank line.
print(
    "\nRandom Forest Regression:",
    f"Mean Squared Error: {rf_mse}",
    f"Mean Absolute Error: {rf_mae}",
    f"R-squared: {rf_r2}",
    sep="\n",
)
Random Forest Regression:
Mean Squared Error: 0.027133333333333336
Mean Absolute Error: 0.049066666666666654
R-squared: 0.9913152953226357
In [42]:
# Grouped bar chart comparing the two regressors on the held-out metrics.
# (`np` and `plt` are already imported in the notebook's first cell.)
models = ['Linear Regression', 'Random Forest']

# One list per metric, ordered like `models`.
mse_values = [linear_mse, rf_mse]
mae_values = [linear_mae, rf_mae]
r2_values = [linear_r2, rf_r2]

x = np.arange(len(models))  # centre of each model's bar group
width = 0.28  # width of a single bar

fig, ax = plt.subplots(figsize=(10, 6))

# Three bars per model: shifted left, centred, shifted right.
rects1 = ax.bar(x - width, mse_values, width, label='MSE')
rects2 = ax.bar(x, mae_values, width, label='MAE')
rects3 = ax.bar(x + width, r2_values, width, label='R2 Score')

# Axis labelling and legend.
ax.set_ylabel('Scores')
ax.set_title('Scores by Regression Algorithm and Metric')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()


def add_values(rects):
    """Write each bar's height, rounded to three decimals, just above the bar."""
    for bar in rects:
        bar_height = bar.get_height()
        ax.annotate(
            f'{round(bar_height, 3)}',
            xy=(bar.get_x() + bar.get_width() / 2, bar_height),
            xytext=(0, 3),  # nudge the label 3 points upwards
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


for bar_group in (rects1, rects2, rects3):
    add_values(bar_group)

fig.tight_layout()
plt.show()
No description has been provided for this image